load packages

library(knitr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(here)
## here() starts at C:/Users/edsqu/Documents/R/Portfolio

additional packages being used

library(tmap)
library(sf)
library(tidycensus)

load testing data

df1=readxl::read_excel(here("data","tot_tests_neighborhood.xlsx")) %>% 
  clean_names() %>%
  #mutate(date_reported= as_datetime(date_reported))
  separate(date_reported,into=c("date","hr"),sep = " ")%>%
  mutate(date_report=as_date(date))%>%
  group_by(neighborhood)%>%
  mutate(across(total_tests, ~ .-c(0,lag(.)[-1])))

Load Mapping Data

neigh=st_read(here("data","dc_neigh.geojson")) %>% clean_names()
## Reading layer `DC_Health_Planning_Neighborhoods' from data source 
##   `C:\Users\edsqu\Documents\R\Portfolio\data\dc_neigh.geojson' 
##   using driver `GeoJSON'
## Simple feature collection with 51 features and 6 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -77.11976 ymin: 38.79165 xmax: -76.9094 ymax: 38.99556
## Geodetic CRS:  WGS 84
class(neigh)
## [1] "sf"         "data.frame"

celect and clean data

df_tests=df1 %>%
  filter(as_date(date_report) == "2021/10/27") %>% 
  separate(neighborhood,into=c("code","name"),sep = ":") %>% 
  mutate(code=case_when(
    code=="N35" ~"N0",
    TRUE ~ code
  ))

join mapping data with test data

neigh2=left_join(neigh,df_tests,by=c("code")) 
tmap_mode("view")
tm_shape(neigh2) +tm_polygons("total_tests",alpha=.5)

load positive cases data

df_c=readxl::read_excel(here("data","neigh_cases.xlsx"),
                            col_types = c("date", "text", "numeric")) %>% 
  clean_names() 
df_cases=df_c %>%
  filter(as_date(date) == "2021/10/27") %>% 
  separate(neighborhood,into=c("code","name"),sep = ":") %>%
  mutate(code=case_when(
    code=="N35" ~"N0",
    TRUE ~ code
  ))

Join covid data

neigh3=left_join(neigh2,df_cases,by=c("code")) 
tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(neigh3) +tm_polygons("total_positives",alpha=.5)

graph data

 tm_shape(neigh3)+tm_polygons(c("total_positives","total_tests"),alpha=.5)

get census Data

census_api_key("2c8b9d5c4902b7efb4e1f98b2c23692cb1b73e95")
## To install your API key for use in future sessions, run this function with `install = TRUE`.
v20 = load_variables(2018,"acs5")
df_cencus=get_acs(geography = "tract",
                  variables=c("pop"="B01001_001"),
                  state="DC",geometry=TRUE,year=2018) 
## Getting data from the 2014-2018 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |============================                                          |  41%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |======================================================================| 100%
df_cens=df_cencus %>% select(-moe) %>% spread(variable,estimate) 
df_cens_adj=df_cens %>% st_transform(4326)

Join Census data to mapping data

df_j=st_join(df_cens_adj,neigh3,largest=TRUE)
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries

Summarize population for each neighbourhood

df3=df_j %>% select(pop,code) %>%
  group_by(code) %>%
  summarise(pop_n=sum(pop))

Join population data and generate results

Removed any extreme outliers and generated map to compare testing rates and covid rates

df4=left_join(neigh3,df3 %>% st_set_geometry(NULL))
## Joining, by = "code"
df4=df4 %>% mutate( covid_rate=total_positives/pop_n, test_rate=total_tests/pop_n)
df4 %>% filter(!(code %in% c("N0","N15","N24"))) %>% tm_shape()+tm_polygons(c("covid_rate","test_rate"))

From the graphs above, it looks like covid rates and covid testing have no major relationship in the DC area. Further analysis with other variables will be taken as a class.

Reflection: This was our side of the final project. We found it easy to join our data with the neighborhoods data, as both data sets have the same neighborhoods. We figured this would be useless without adjusting for population. We then brought it the census data and compared covid test to covid cases. The maps we generated show that it is unlikely that they are strongly related. This document practiced manipulation and visualization skills, while developing the overall model as a class with multiple conditions from many data sets shows more of an overall data science process, as we are individually finding data, cleaning it and joining it. We will then bring it all together and build a model to interpret the data and hopefully draw some conclusions.